import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import folium
import random
from pulp import *
import json
# Widen pandas' console output so wide frames render fully in the notebook.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 700
from haversine import haversine
# Load the Boston crime export; latin-1 covers the non-UTF-8 bytes in the raw file.
data = pd.read_csv("Boston Crime.csv", low_memory=False, encoding="ISO-8859-1")
data['OCCURRED_ON_DATE'] = pd.to_datetime(data['OCCURRED_ON_DATE'])
# One incident can span several offense rows — inspect the heaviest repeaters.
data.groupby(["INCIDENT_NUMBER"])["OFFENSE_CODE_GROUP"].count().sort_values(ascending=False).head()
data[data.INCIDENT_NUMBER == "I162030584"].head()
# Keep one row per incident for the analyses that count unique incidents.
data_non_duplicates = (
    data.drop_duplicates(subset="INCIDENT_NUMBER", keep="first").reset_index(drop=True)
)
data.info()
# Incident counts per calendar year.
data.groupby("YEAR")["INCIDENT_NUMBER"].count()
df_2015 = data[data.YEAR == 2015]
print(df_2015["OCCURRED_ON_DATE"].min())
df_2018 = data[data.YEAR == 2018]
print(df_2018["OCCURRED_ON_DATE"].max())
# How many days each partial year covers (2015 starts mid-June, 2018 stops early Oct).
print(dt.datetime.strptime("2016-1-1", '%Y-%m-%d') - dt.datetime.strptime("2015-6-15", '%Y-%m-%d'))
print(dt.datetime.strptime("2018-10-03", '%Y-%m-%d') - dt.datetime.strptime("2018-1-1", '%Y-%m-%d'))
# Yearly totals normalised by the number of days actually covered.
crime_data = (
    data.groupby("YEAR")["INCIDENT_NUMBER"]
    .count()
    .to_frame()
    .rename(columns={"INCIDENT_NUMBER": "CRIME_NUMBER"})
)
crime_data["Total_Day"] = [200, 365, 365, 275]  # days covered in 2015..2018
crime_data["Daily_Average"] = crime_data["CRIME_NUMBER"] / crime_data["Total_Day"]
crime_data
# Bar chart: total incidents per year, each bar annotated with its count.
sns.set()
p0 = plt.figure(figsize=(7, 6))
plt.title(r'2015-2018 Total')
plt.bar(range(crime_data.index.shape[0]), crime_data.loc[:, 'CRIME_NUMBER'])
# FIX: the original called plt.xlabel('CRIME_NUMBERS') and immediately overwrote it
# with plt.xlabel('Years'); the dead first call is removed.
plt.xlabel('Years')
plt.ylabel('Yearly Crime Numbers')
plt.xticks(range(crime_data.index.shape[0]), crime_data.index)
x = np.arange(crime_data.index.shape[0])
y = np.array(crime_data['CRIME_NUMBER'])
for i, j in zip(x, y):
    plt.text(i, j, '%d' % j, ha='center')
plt.show()
# Companion chart: daily average of incidents per year.
sns.set()
p0 = plt.figure(figsize=(7, 6))
plt.title(r'2015-2018 Average')
bar_positions = range(crime_data.index.shape[0])
plt.bar(bar_positions, crime_data.loc[:, 'Daily_Average'])
plt.xlabel('Years')
plt.ylabel('Daily Average of Crimes')
plt.xticks(bar_positions, crime_data.index)
x = np.arange(crime_data.index.shape[0])
y = np.array(crime_data['Daily_Average'])
# Annotate each bar (note: '%d' truncates the average to a whole number).
for pos, val in zip(x, y):
    plt.text(pos, val, '%d' % val, ha='center')
plt.show()
# Unique incidents per (year, month), plotted as grouped bars.
Montly_Crime_Data = (
    data_non_duplicates.groupby(["YEAR", "MONTH"])["INCIDENT_NUMBER"]
    .count()
    .reset_index()
    .rename(columns={"INCIDENT_NUMBER": "Crime_Number"})
)
Montly_Crime_Data.head()
sns.set_theme(style="whitegrid")
sns.set(rc={'figure.figsize': (15, 20)})
# FIX: `ci="sd"` is deprecated (removed in recent seaborn) and meaningless here —
# each (MONTH, YEAR) group is a single pre-aggregated count, so there is no spread
# to draw. `errorbar=None` (seaborn >= 0.12) states that explicitly.
g = sns.catplot(
    data=Montly_Crime_Data, kind="bar",
    x="MONTH", y="Crime_Number", hue="YEAR",
    errorbar=None, palette="dark", alpha=.8, height=7, aspect=14 / 10,
)
g.despine(left=True)
g.set_axis_labels("Months", "Total Crimes")
g.legend.set_title("Years")
# Five most frequent offense groups across the whole dataset.
most_freq_crimes = data.groupby("OFFENSE_CODE_GROUP")["INCIDENT_NUMBER"].count().sort_values(ascending=False)[:5].index
freq_crimes = data[data.OFFENSE_CODE_GROUP.isin(most_freq_crimes)].reset_index(drop=True)
freq_crimes = freq_crimes.dropna(subset=['DISTRICT'])
# Top-10 offense groups (display only).
pd.DataFrame(data.groupby("OFFENSE_CODE_GROUP")["INCIDENT_NUMBER"].count().sort_values(ascending=False)[:10]).reset_index().rename(columns={"INCIDENT_NUMBER": "INCIDENT_NUMBER_TOTAL"})
# Frequent-offense incident counts per district (display only).
pd.DataFrame(freq_crimes.groupby(["DISTRICT"])["INCIDENT_NUMBER"].count()).reset_index()
# Per (offense, district) counts, enriched with offense-wide and district-wide totals.
freq_crimes_data = pd.merge(
    left=pd.DataFrame(freq_crimes.groupby(["OFFENSE_CODE_GROUP", "DISTRICT"])["INCIDENT_NUMBER"].count()).reset_index(),
    right=pd.DataFrame(data.groupby("OFFENSE_CODE_GROUP")["INCIDENT_NUMBER"].count().sort_values(ascending=False)[:5]).reset_index().rename(columns={"INCIDENT_NUMBER": "INCIDENT_NUMBER_TOTAL"}),
    on="OFFENSE_CODE_GROUP", how="left")
freq_crimes_data = pd.merge(
    left=freq_crimes_data,
    right=pd.DataFrame(freq_crimes.groupby(["DISTRICT"])["INCIDENT_NUMBER"].count()).reset_index().rename(columns={"INCIDENT_NUMBER": "INCIDENT_NUMBER_DISTRICT"}),
    on="DISTRICT", how="left")
# Share of the offense that lands in each district, and share of the district's load.
freq_crimes_data["Offense_Percentage"] = (freq_crimes_data["INCIDENT_NUMBER"] / freq_crimes_data["INCIDENT_NUMBER_TOTAL"]) * 100
freq_crimes_data["District_Percentage"] = (freq_crimes_data["INCIDENT_NUMBER"] / freq_crimes_data["INCIDENT_NUMBER_DISTRICT"]) * 100
# FIX: the positional `axis` argument to drop() was removed in pandas 2.0 — use columns=.
freq_crimes_data = freq_crimes_data.drop(columns=["INCIDENT_NUMBER_TOTAL", "INCIDENT_NUMBER_DISTRICT"])
freq_crimes_data = freq_crimes_data.sort_values(["DISTRICT", "INCIDENT_NUMBER"], ascending=(True, False)).reset_index(drop=True)
freq_crimes_data.head(10)
plt.rcParams['axes.labelsize'] = 16
plt.rcParams['axes.titlesize'] = 16
fig = plt.figure(figsize=(8, 5))
# Bar chart of one district's offense mix.
# FIX: the original plotted DISTRICT.unique()[6] but titled the figure with
# DISTRICT.unique()[10] — the title named a different district than the data shown.
# Hoist the district once and use it for both the selection and the title.
district = freq_crimes_data["DISTRICT"].unique()[6]
district_rows = freq_crimes_data[freq_crimes_data.DISTRICT == district]
plt.bar(district_rows["OFFENSE_CODE_GROUP"], district_rows["District_Percentage"],
        color='maroon', width=.5)
plt.xticks(rotation=90)
plt.xlabel("Offenses", fontsize=18)
plt.ylabel("Percentage Of Incidents")
plt.title(district)
plt.show()
# One offense-mix bar chart per district on a 6x2 grid.
# FIX: the original created a 6x2 grid with plt.subplots() and then stacked NEW axes
# on top of it with fig.add_subplot(), leaving the first grid's empty frames behind
# (and both `axes` and the returned `ax` unused). Draw directly on the grid's axes.
fig, axes = plt.subplots(figsize=(12, 34), nrows=6, ncols=2)
for ax, category in zip(axes.flat, freq_crimes_data["DISTRICT"].unique()):
    rows = freq_crimes_data[freq_crimes_data.DISTRICT == category]
    ax.bar(rows["OFFENSE_CODE_GROUP"], rows["District_Percentage"])
    ax.tick_params(axis="x", rotation=30)
    ax.set_title(category)
# Re-rank: districts ordered by their share of each offense group.
freq_crimes_data = freq_crimes_data.sort_values(
    ["OFFENSE_CODE_GROUP", "Offense_Percentage"], ascending=(True, False)
).reset_index(drop=True)
fig = plt.figure(figsize=(8, 5))
# Bar chart: how one offense group distributes across the districts.
offense = freq_crimes_data["OFFENSE_CODE_GROUP"].unique()[4]
offense_rows = freq_crimes_data[freq_crimes_data.OFFENSE_CODE_GROUP == offense]
plt.bar(offense_rows["DISTRICT"], offense_rows["Offense_Percentage"], color='olive', width=.6)
plt.xticks(rotation=0)
plt.xlabel("Districts")
plt.ylabel("Percentage Of Incidents")
plt.title(offense)
plt.show()
freq_crimes_data.head(20)
# Coordinate table for mapping.
# FIX: the original took a column slice of `data` and then assigned into it, which
# triggers pandas' SettingWithCopyWarning and can silently fail to write — take an
# explicit copy instead.
location_data = data[["INCIDENT_NUMBER", "DISTRICT", "YEAR", "Lat", "Long", "Location"]].copy()
location_data.isnull().sum()
location_data[location_data.Long.isnull() == True].head()
location_data = location_data.dropna(subset=['Lat'])
location_data[location_data.DISTRICT.isnull() == True]
location_data["DISTRICT"] = location_data["DISTRICT"].fillna("No_Info")
# Normalise "(lat, long)" into "lat,long".
location_data["Location"] = location_data["Location"].apply(lambda x: x.strip("()").strip(" "))
location_data["Location"] = location_data["Location"].apply(lambda x: str(x.split(" ")[0] + x.split(" ")[1]))
# Long == -1 marks unknown coordinates; keep only 2017 rows with real coordinates.
location_data = location_data[~location_data.Long.isin([-1])].reset_index(drop=True)
location_data = location_data[location_data.YEAR == 2017].reset_index(drop=True)
location_data.head()
# Sample 2500 incidents to keep the map responsive.
# FIX: random.sample(range(0, index.max()), ...) could never select the last row
# (range excludes its stop, and after reset_index max == len-1) — sample over the
# full positional range instead.
randomlist = random.sample(range(len(location_data)), 2500)
WHS_COORD = [location_data["Lat"][0], location_data["Long"][0]]
map_nyc = folium.Map(location=WHS_COORD, zoom_start=11, width=740, height=500)
test_colors = ['purple', 'orange', 'darkred', 'lightred', 'yellow',
               'darkblue', 'darkgreen', 'white', 'pink', 'lightblue', 'lightgreen', 'gray', 'black']
# One fixed colour per district; assumes at most 13 distinct districts — TODO confirm.
my_colors = {district: test_colors[i] for i, district in enumerate(location_data.DISTRICT.unique())}
for i in randomlist:
    my_color = my_colors[location_data["DISTRICT"][i]]
    folium.CircleMarker([location_data["Lat"][i], location_data["Long"][i]], radius=5,
                        color=my_color, fill_color='#0080bb',
                        popup=location_data["DISTRICT"][i]).add_to(map_nyc)
map_nyc
# District A15, 2016: unique incidents with usable coordinates.
A15_2016 = data_non_duplicates[(data_non_duplicates["DISTRICT"] == "A15") &
                               (data_non_duplicates["YEAR"] == 2016)][
    ["INCIDENT_NUMBER", "Lat", "Long"]].dropna().reset_index(drop=True)
A15_2016 = A15_2016[~A15_2016.Lat.isin([-1])].reset_index(drop=True)
pd.concat([A15_2016.head(), A15_2016.tail()])
# Candidate station sites: 30 randomly chosen incident locations.
Potential_Station_Number = 30
Station_ids = ["ST" + str(x) for x in range(1, Potential_Station_Number + 1)]
# FIX: range(0, index.max()) excluded the last row from sampling (off-by-one) —
# use the full positional range.
randomlist = random.sample(range(len(A15_2016)), len(Station_ids))
potential_stations = pd.DataFrame({"Station_id": Station_ids,
                                   "Station_Lat": A15_2016.loc[randomlist, "Lat"].values.tolist(),
                                   "Station_Long": A15_2016.loc[randomlist, "Long"].values.tolist()})
# Map: incidents as red circles, candidate stations as pin markers.
WHS_COORD = [A15_2016["Lat"][0], A15_2016["Long"][0]]
map_nyc = folium.Map(location=WHS_COORD, zoom_start=13, width=740, height=500)
for i in range(len(A15_2016)):
    folium.CircleMarker([A15_2016["Lat"][i], A15_2016["Long"][i]], radius=5, color="red",
                        fill_color='#0080bb', popup=A15_2016["INCIDENT_NUMBER"][i]).add_to(map_nyc)
for j in range(len(potential_stations)):
    folium.Marker([potential_stations["Station_Lat"][j], potential_stations["Station_Long"][j]],
                  radius=5, popup=potential_stations["Station_id"][j]).add_to(map_nyc)
map_nyc
# Build a stations-by-incidents frame: one zero-filled row per candidate station,
# a "potential_coords" column holding each station's "lat,long" string, and a final
# row (index 0) holding each incident's "lat,long" string. Only the coordinate
# strings are consumed downstream; the zeros are placeholders.
N = A15_2016["INCIDENT_NUMBER"].nunique()
print("The dimension : " + str(N))
# The original built an N x N frame of running integers, truncated it to the station
# count and zeroed every cell — constructing the zero frame directly is equivalent.
res = pd.DataFrame(0, index=potential_stations["Station_id"].values,
                   columns=A15_2016["INCIDENT_NUMBER"])
res["potential_coords"] = [
    str(lat) + "," + str(long)
    for lat, long in zip(potential_stations["Station_Lat"], potential_stations["Station_Long"])
]
demand_coords = [str(A15_2016["Lat"][j]) + "," + str(A15_2016["Long"][j])
                 for j in range(len(A15_2016))]
demand_coords.append("0")  # filler for the potential_coords column
demand_coords = pd.DataFrame(demand_coords).T
demand_coords.columns = res.columns
# FIX: DataFrame.append() was removed in pandas 2.0 — use pd.concat instead.
res = pd.concat([res, demand_coords])
res
# Distance matrix: haversine distance in metres (truncated to int) from every
# candidate station to every incident.
# PERF FIX: the original re-split both "lat,long" strings inside the inner loop
# (O(stations x incidents) string parsing). Station coordinates are taken from
# potential_stations directly (the same values the strings were built from —
# str()/float() round-trips exactly), and each incident's string is parsed once
# per column, so the numeric results are unchanged.
last_data = pd.DataFrame()
station_points = [
    (float(potential_stations["Station_Lat"][i]), float(potential_stations["Station_Long"][i]))
    for i in range(len(potential_stations))
]
for j in range(0, (len(res.columns) - 1)):
    demand_cell = res[res.columns[j]][0]  # row 0 is the incident-coordinates row
    demand_point = (float(demand_cell.split(",")[0]), float(demand_cell.split(",")[1]))
    last_data[res.columns[j]] = [
        int(haversine(station_point, demand_point) * 1000)
        for station_point in station_points
    ]
last_data.index = potential_stations["Station_id"].values
last_data.head()
# Solve one assignment LP per single candidate station: with only one station every
# incident must be served by it, so the objective equals that station's total
# distance to all incidents. Kept as an LP for symmetry with the 3-station search.
optimization_result = pd.DataFrame({"Station": [], "Total_Cost": []})
result_rows = []
for station in range(len(potential_stations)):
    potentials = potential_stations["Station_id"].values.tolist()[station:station + 1]
    demands = A15_2016["INCIDENT_NUMBER"].values.tolist()
    distance = last_data[station:station + 1].to_dict('index')
    prob = LpProblem("Transportation", LpMinimize)
    routes = [(i, j) for i in potentials for j in demands]
    # Binary X[i][j]: station i serves incident j.
    amount_vars = LpVariable.dicts("X", (last_data[station:station + 1].index.tolist(), demands),
                                   lowBound=0, upBound=1, cat='Binary')
    prob += lpSum(amount_vars[i][j] * distance[i][j] for (i, j) in routes)
    # Every incident must be assigned to exactly one (here: the only) station.
    for j in demands:
        prob += lpSum(amount_vars[i][j] for i in last_data[station:station + 1].index.tolist()) == 1
    prob.solve()
    # FIX: DataFrame.append() was removed in pandas 2.0 — collect rows, concat once.
    result_rows.append(pd.DataFrame({"Station": potentials, "Total_Cost": value(prob.objective)}))
optimization_result = pd.concat(result_rows)
optimization_result = optimization_result.reset_index(drop=True)
optimization_result.head(10)
# Locate the cheapest single station and plot it with the incidents.
optimization_result[optimization_result["Total_Cost"] == optimization_result["Total_Cost"].min()]
minimum_cost = optimization_result["Total_Cost"].min()
best_station_index = optimization_result[optimization_result["Total_Cost"] == minimum_cost].index[0]
WHS_COORD = [A15_2016["Lat"][0], A15_2016["Long"][0]]
map_nyc = folium.Map(location=WHS_COORD, zoom_start=13, width=740, height=500)
for idx in range(len(A15_2016)):
    folium.CircleMarker([A15_2016["Lat"][idx], A15_2016["Long"][idx]], radius=5, color="red",
                        fill_color='#0080bb', popup=A15_2016["INCIDENT_NUMBER"][idx]).add_to(map_nyc)
folium.Marker([potential_stations["Station_Lat"][best_station_index],
               potential_stations["Station_Long"][best_station_index]],
              radius=5, popup=potential_stations["Station_id"][best_station_index]).add_to(map_nyc)
map_nyc
from itertools import combinations
def rSubset(arr, r):
    """Return every r-element combination of *arr* as a list of tuples, in lexicographic order."""
    return [*combinations(arr, r)]
# Shrink the candidate pool so the 3-combination search stays tractable:
# keep the first 15 candidates and drop ST12.
potential_stations = potential_stations[:15]
# FIX: reset_index() without drop=True left the old index behind as a stray 'index'
# column; every other reset_index in this script uses drop=True — be consistent.
potential_stations = potential_stations[~potential_stations["Station_id"].isin(["ST12"])].reset_index(drop=True)
total_station_number = 3
# Exhaustively solve the assignment LP for every 3-station combination: each incident
# is served by exactly one of the chosen stations, minimising total distance.
optimization_result_3_stations = pd.DataFrame({"Station": [], "Total_Cost": []})
combo_rows = []
for station in rSubset(potential_stations["Station_id"], total_station_number):
    potentials = potential_stations[potential_stations["Station_id"].isin(station)]["Station_id"].values.tolist()
    demands = A15_2016["INCIDENT_NUMBER"].values.tolist()
    distance = last_data.loc[potentials, :].to_dict('index')
    prob = LpProblem("Transportation", LpMinimize)
    routes = [(i, j) for i in potentials for j in demands]
    # Binary X[i][j]: station i serves incident j.
    amount_vars = LpVariable.dicts("X", (potentials, demands), lowBound=0, upBound=1, cat='Binary')
    prob += lpSum(amount_vars[i][j] * distance[i][j] for (i, j) in routes)
    # Each incident is covered by exactly one station of the combination.
    for j in demands:
        prob += lpSum(amount_vars[i][j] for i in potentials) == 1
    prob.solve()
    # FIX: DataFrame.append() was removed in pandas 2.0 — collect rows, concat once.
    combo_rows.append(pd.DataFrame({"Station": station, "Total_Cost": value(prob.objective)}))
optimization_result_3_stations = pd.concat(combo_rows)
optimization_result_3_stations = optimization_result_3_stations.reset_index(drop=True)
# The winning combination shows up as three rows sharing the minimal Total_Cost.
optimization_result_3_stations[optimization_result_3_stations["Total_Cost"] == optimization_result_3_stations["Total_Cost"].min()]["Station"].values.tolist()
# Map the incidents together with the three selected stations.
# NOTE(review): station rows 0, 2 and 3 appear hard-coded from a previous
# optimisation run — verify they still match the current winning combination.
WHS_COORD = [A15_2016["Lat"][0], A15_2016["Long"][0]]
map_nyc = folium.Map(location=WHS_COORD, zoom_start=13, width=740, height=500)
for i in range(len(A15_2016)):
    folium.CircleMarker([A15_2016["Lat"][i], A15_2016["Long"][i]], radius=5, color="red",
                        fill_color='#0080bb', popup=A15_2016["INCIDENT_NUMBER"][i]).add_to(map_nyc)
for row in (0, 2, 3):
    folium.Marker([potential_stations["Station_Lat"][row], potential_stations["Station_Long"][row]],
                  radius=5, popup=potential_stations["Station_id"][row]).add_to(map_nyc)
map_nyc
# 2016 unique incidents, with day and hour extracted for temporal profiles.
year_2016 = data_non_duplicates[data_non_duplicates.YEAR == 2016].reset_index(drop=True)
# IDIOM FIX: derive DAY/HOUR through the datetime accessor instead of casting the
# whole column to str and splitting it; strftime yields the exact same strings
# ("YYYY-MM-DD" and zero-padded "HH") directly and vectorised.
year_2016["DAY"] = year_2016["OCCURRED_ON_DATE"].dt.strftime("%Y-%m-%d")
year_2016["HOUR"] = year_2016["OCCURRED_ON_DATE"].dt.strftime("%H")
# Distribution of incidents per day.
sns.displot(year_2016.groupby(["DAY"])["INCIDENT_NUMBER"].count().values.tolist())
plt.title("Histogram of Daily Incident Numbers")
plt.show()
# Incident totals by hour of day.
plt.figure(figsize=(8, 6))
plt.title(r'Hourly Incidents')
sns.barplot(x="HOUR", y="INCIDENT_NUMBER",
            data=pd.DataFrame(year_2016.groupby(["HOUR"])["INCIDENT_NUMBER"].count()).reset_index())
plt.show()
# Same daily profile restricted to district A1, plus mean comparisons.
year_2016_A1 = year_2016[year_2016.DISTRICT == "A1"].reset_index(drop=True)
daily_counts_a1 = year_2016_A1.groupby(["DAY"])["INCIDENT_NUMBER"].count().values.tolist()
sns.displot(daily_counts_a1)
plt.title("Histogram of Daily Incident Numbers for A1 Region")
plt.show()
daily_counts_all = year_2016.groupby(["DAY"])["INCIDENT_NUMBER"].count().values.tolist()
print("Mean of Daily Unique Incidents for A1 = {}".format(np.mean(daily_counts_a1)))
print("Mean of Daily Unique Incidents = {}".format(np.mean(daily_counts_all)))